package com.saibo.spider.gather;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.saibo.spider.commons.SubmitRedisCommons;
import com.saibo.spider.pojo.BaiduNewsInfo;
import com.saibo.spider.utils.EncodeUtil;
import com.saibo.spider.utils.HttpRequestUtil;
import com.saibo.spider.utils.HttpResponseUtil;
import com.saibo.spider.utils.MD5Util;
import com.saibo.spider.utils.StringUtil;

/**
 * Crawls Baidu News search result pages for one keyword.
 *
 * <p>Each fetched page is parsed into {@link BaiduNewsInfo} entries which are
 * serialized to JSON and submitted to Redis under the key
 * {@code baidu_news_info}; the "next page" link of the page is then followed.
 *
 * <p>Instances carry mutable crawl state ({@link #url}, {@link #referer}) and
 * are not designed to be shared between threads.
 */
public class MouldBaiduNews {

	/** Value returned by {@link #connect(String)} when the response body was empty. */
	private static final String FETCH_FAILED = "1";

	/** How many additional attempts are made for a page whose body came back empty. */
	private static final int MAX_RETRIES = 3;

	/** Pause between two consecutive result-page requests, in milliseconds. */
	private static final long PAGE_INTERVAL_MILLIS = 15 * 1000;

	/** URL of the page that is currently being crawled. */
	String url = "";
	/** URL of the previously crawled page; sent as the Referer header when not empty. */
	String referer = "";
	/** Not used by this class; kept so existing package-level accesses keep compiling. */
	String nextPage = "";

	/**
	 * Crawls every result page for a sample keyword, following "next page"
	 * links until there is none or a page cannot be fetched.
	 *
	 * <p>Example of a generated search URL:
	 * http://news.baidu.com/ns?ct=1&amp;rn=20&amp;ie=utf-8&amp;bs=%E5%AD%A4%E8%8A%B3%E4%B8%8D%E8%87%AA%E8%B5%8F&amp;rsv_bp=1&amp;sr=0&amp;cl=2&amp;f=8&amp;prevct=no&amp;tn=newstitle&amp;word=%E5%AD%A4%E8%8A%B3%E4%B8%8D%E8%87%AA%E8%B5%8F
	 *
	 * @param args unused
	 * @throws Exception if the crawl is interrupted while pausing between pages
	 */
	public static void main(String[] args) throws Exception {
		MouldBaiduNews s = new MouldBaiduNews("孤芳不自赏");
		String nextPage = s.connectWithRetry(s.url);
		// Only a real URL may become the next request target; null/empty means
		// "no more pages" and FETCH_FAILED means the retries were exhausted.
		while (isPageUrl(nextPage)) {
			Thread.sleep(PAGE_INTERVAL_MILLIS);
			s.referer = s.url;
			s.url = nextPage;
			nextPage = s.connectWithRetry(s.url);
		}
	}

	/**
	 * Builds the first search URL for the given keyword.
	 *
	 * @param targetWorld the search keyword; it is URL-encoded as UTF-8
	 */
	public MouldBaiduNews(String targetWorld) {
		String encodedWord = EncodeUtil.encodeURL(targetWorld, "UTF-8");
		this.url = "http://news.baidu.com/ns?ct=1&rn=20&ie=utf-8&bs=" + encodedWord
				+ "&rsv_bp=1&sr=0&cl=2&f=8&prevct=no&tn=newstitle&word=" + encodedWord;
	}

	/**
	 * Fetches one result page, submits its entries to Redis and returns the
	 * address of the following page.
	 *
	 * <p>Entries whose source/date line cannot be parsed are reported on
	 * {@code System.err} and skipped, so one malformed entry no longer discards
	 * the whole page.
	 *
	 * @param url the result page to fetch
	 * @return the absolute URL of the next page, {@code "1"} when the response
	 *         body was empty (nothing is submitted in that case), or
	 *         {@code null} when the page has no "next page" link
	 * @throws ParseException kept in the signature for source compatibility
	 *         with existing callers; malformed dates are now skipped instead
	 */
	public String connect(String url) throws ParseException {
		CloseableHttpResponse httpResponse = HttpRequestUtil.getRequestMethod(url, requestHeader(referer));
		String html;
		try {
			html = HttpResponseUtil.getHtml(httpResponse);
		} finally {
			closeQuietly(httpResponse);
		}
		if (StringUtil.isNullOrEmpty(html)) {
			return FETCH_FAILED;
		}

		Document document = Jsoup.parse(html);
		// SimpleDateFormat is not thread-safe, so it stays local to this call;
		// it is created once per page rather than once per entry.
		SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
		List<BaiduNewsInfo> lb = new ArrayList<BaiduNewsInfo>();
		for (Element element : document.select("#content_left>div>div.result")) {
			BaiduNewsInfo baidu = parseResult(element, dateFormat);
			if (baidu != null) {
				lb.add(baidu);
			}
		}
		SubmitRedisCommons.submitRedisData("baidu_news_info", StringUtil.objectToJson(lb));

		return findNextPage(document);
	}

	/**
	 * Builds the HTTP request headers used for every page request.
	 *
	 * @param referer value for the Referer header; the header is omitted when
	 *        this is null or empty
	 * @return a new, mutable map of header names to values
	 */
	public Map<String, String> requestHeader(String referer) {
		Map<String, String> requestHeader = new HashMap<String, String>();
		requestHeader.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
		requestHeader.put("Accept-Encoding", "gzip, deflate, sdch");
		requestHeader.put("Accept-Language", "zh-CN,zh;q=0.8");
		requestHeader.put("Cache-Control", "max-age=0");
		requestHeader.put("Connection", "keep-alive");
		// NOTE(review): this looks like a captured browser session (it contains a
		// BDUSS value). Hard-coding it in source is a credential-leak risk and it
		// will presumably expire; consider loading it from configuration.
		requestHeader.put("Cookie",
				"BIDUPSID=15939BF047EFF64AA9A228E5964E5876; PSTM=1479486389; BAIDUID=15939BF047EFF64AA9A228E5964E5876:FG=1; __cfduid=dc91fbd4656b354c1896b7286e1eccc571481182804; ispeed_lsm=2; BDUSS=o0N29QTlppdnpOfkh5fmd4VG9Ifn5rbHpUUXNzREZ1UWNlRjRDUlR-dmVFWWhZSVFBQUFBJCQAAAAAAAAAAAEAAADXcw4NvsC94bXEtbDM209yegAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAN6EYFjehGBYZX; MCITY=-131%3A; BD_HOME=1; H_PS_PSSID=1463_21082_17001_20697_21673_20719; BD_UPN=12314753");
		requestHeader.put("Host", "news.baidu.com");
		if (!StringUtil.isNullOrEmpty(referer)) {
			requestHeader.put("Referer", referer);
		}
		requestHeader.put("Upgrade-Insecure-Requests", "1");
		requestHeader.put("User-Agent",
				"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
		return requestHeader;
	}

	/** Returns true when the value is an actual page address rather than null, empty or the failure marker. */
	private static boolean isPageUrl(String value) {
		return !StringUtil.isNullOrEmpty(value) && !FETCH_FAILED.equals(value);
	}

	/**
	 * Calls {@link #connect(String)} and repeats the request up to
	 * {@link #MAX_RETRIES} times while the body keeps coming back empty.
	 * A page that was fetched successfully is never requested again, so its
	 * entries are submitted only once.
	 */
	private String connectWithRetry(String pageUrl) throws ParseException {
		String result = connect(pageUrl);
		for (int i = 0; i < MAX_RETRIES && FETCH_FAILED.equals(result); i++) {
			// Parentheses matter: without them "i + 1" is string concatenation.
			System.out.println("重新请求:" + (i + 1) + "次");
			result = connect(pageUrl);
		}
		return result;
	}

	/**
	 * Converts one result element into a {@link BaiduNewsInfo}.
	 *
	 * @return the parsed entry, or {@code null} when its source/date line does
	 *         not yield a parseable date
	 */
	private static BaiduNewsInfo parseResult(Element element, SimpleDateFormat dateFormat) {
		Elements titleLink = element.select("div>h3.c-title>a");
		String title = StringUtil.element2text(titleLink.first());
		String newsurl = titleLink.attr("href");

		String msg = StringUtil.element2text(element.select("div>div.c-title-author").first());
		if (msg == null) {
			msg = "";
		}
		// Turn "yyyy年MM月dd日" into "yyyy-MM-dd" so it matches the date pattern.
		msg = msg.replace("&gt;", ">").replace("年", "-").replace("月", "-").replace("日", "");

		// Entries that mention grouped duplicate news are echoed on stderr, the rest on stdout.
		String line = title + "\t" + newsurl + "\t" + msg;
		if (msg.contains("条相同新闻")) {
			System.err.println(line);
		} else {
			System.out.println(line);
		}

		// The code relies on the line splitting into "<source> <date> <HH:mm> ...".
		// NOTE(review): lines in any other shape (fewer than three tokens) used to
		// raise ArrayIndexOutOfBoundsException; they are skipped now.
		String[] msgCache = msg.split(" ");
		if (msgCache.length < 3) {
			System.err.println("Skipping entry without a parseable date: " + line);
			return null;
		}
		Date createTime;
		try {
			createTime = dateFormat.parse(msgCache[1] + " " + msgCache[2] + ":00");
		} catch (ParseException e) {
			System.err.println("Skipping entry with malformed date (" + e.getMessage() + "): " + line);
			return null;
		}

		BaiduNewsInfo baidu = new BaiduNewsInfo();
		baidu.setRowkey(MD5Util.GET16BITMD5(newsurl));
		baidu.setTitle(title);
		baidu.setUrl(newsurl);
		baidu.setComefrom(msgCache[0]);
		baidu.setCreate_time(createTime);
		baidu.setCrawler_time(new Date());
		return baidu;
	}

	/**
	 * Looks for the pager link labelled "下一页>" and returns its absolute URL,
	 * or {@code null} when the page has no such link.
	 */
	private static String findNextPage(Document document) {
		for (Element element : document.select("#page>a.n")) {
			String pageText = StringUtil.element2text(element);
			if (pageText != null) {
				pageText = pageText.replace("&gt;", ">");
			}
			System.out.println(pageText);
			if ("下一页>".equals(pageText)) {
				String href = element.attr("href");
				System.out.println(href);
				return "http://news.baidu.com" + href;
			}
		}
		return null;
	}

	/**
	 * Closes the response so its connection is released. Closing again is
	 * expected to be harmless if the body reader already closed it.
	 */
	private static void closeQuietly(CloseableHttpResponse response) {
		if (response == null) {
			return;
		}
		try {
			response.close();
		} catch (IOException e) {
			// The body has already been read at this point; a failure to close
			// must not discard the page, so it is only reported.
			System.err.println("Failed to close HTTP response: " + e.getMessage());
		}
	}
}
